Hive Row/Column Transformation, Hive WordCount, and Configuration Directories

Posted by Jackson on 2017-09-08

Hive Column-to-Row Transformation (collect_set)

create database if not exists company;
use company;

create table company(
name string,
deptno string,
grade string
) row format delimited fields terminated by ',';
load data local inpath '/home/hadoop/data/hive/rowtocol.txt' into table company;

Table data for the column-to-row example

company.name	company.deptno	company.grade
tome deptno01 A
jack deptno01 A
apple deptno02 A
banan deptno02 A
pair deptno03 C
double deptno04 S
Time taken: 0.547 seconds, Fetched: 6 row(s)

Column-to-row SQL statement

select t.dept_grade, concat_ws("|", collect_set(t.name))
from (
  select name, concat(deptno, ",", grade) as dept_grade
  from company
) t
group by t.dept_grade;
t.dept_grade	_c1
deptno01,A tome|jack
deptno02,A apple|banan
deptno03,C pair
deptno04,S double
Time taken: 36.858 seconds, Fetched: 4 row(s)
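If duplicates within a group should be kept instead of collapsed, collect_list can stand in for collect_set. A minimal sketch against the same company table:

-- collect_list keeps duplicates (and arrival order); collect_set de-duplicates
select t.dept_grade, concat_ws("|", collect_list(t.name))
from (
  select name, concat(deptno, ",", grade) as dept_grade
  from company
) t
group by t.dept_grade;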

Hive Row-to-Column Transformation (explode)

Data file

1,tom,HuaXue:Physical:Math:Chinese
2,jack,HuaXue:Animal:Computer:Java
3,john,ZheXue:ZhengZhi:SiXiu:history
4,alice,C++:Linux:Hadoop:Flink

Create the table

create table score(
id int,
name string,
subject array<string>
) row format delimited fields terminated by ','
collection items terminated by ':';
load data local inpath '/home/hadoop/data/hive/coltorow.txt' into table score;
score.id	score.name	score.subject
1 tom ["HuaXue","Physical","Math","Chinese"]
2 jack ["HuaXue","Animal","Computer","Java"]
3 john ["ZheXue","ZhengZhi","SiXiu","history"]
4 alice ["C++","Linux","Hadoop","Flink"]
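Because subject is an array column, Hive's built-in array operators can inspect it directly before any explode. A small sketch:

-- index into the array (0-based) and take its length
select name, subject[0] as first_subject, size(subject) as num_subjects
from score;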

SQL statement

select id,name,sub from 
score lateral view explode(subject) tmp as sub;
hive (company)> select id,name,sub from 
> score lateral view explode(subject) tmp as sub;
OK
id name sub
1 tom HuaXue
1 tom Physical
1 tom Math
1 tom Chinese
2 jack HuaXue
2 jack Animal
2 jack Computer
2 jack Java
3 john ZheXue
3 john ZhengZhi
3 john SiXiu
3 john history
4 alice C++
4 alice Linux
4 alice Hadoop
4 alice Flink
Time taken: 0.163 seconds, Fetched: 16 row(s)
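When the position of each element matters, posexplode emits a 0-based index alongside each value. A minimal sketch:

-- posexplode yields (position, value) pairs per array element
select id, name, pos, sub
from score lateral view posexplode(subject) tmp as pos, sub;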

WordCount in Hive

Create the table

create table wc(
word string
) row format delimited fields terminated by '\r\n';

Load the data

load data local inpath '/home/hadoop/data/hive/wc.txt' into table wc;
select split(word, ',') from wc;

select explode(split(word, ',')) word from wc;

select word, count(1) coun
from (
  select explode(split(word, ',')) word
  from wc
) t
group by word;
word	coun
flink 1
hadoop 3
hive 3
spark 2
Time taken: 84.445 seconds, Fetched: 4 row(s)
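To rank words by frequency, an order by over the aggregate is enough. A minimal sketch:

-- most frequent words first
select word, count(1) coun
from (select explode(split(word, ',')) word from wc) t
group by word
order by coun desc;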

Viewing Hive's Metadata in MySQL

mysql> select * from dbs\G
*************************** 1. row ***************************
DB_ID: 1
DESC: Default Hive database
DB_LOCATION_URI: hdfs://bigdata01:9000/user/hive/warehouse
NAME: default
OWNER_NAME: public
OWNER_TYPE: ROLE
*************************** 2. row ***************************
DB_ID: 2
DESC: this is jackson database
DB_LOCATION_URI: hdfs://bigdata01:9000/bigdata_hive/hive_test
NAME: bigdata_hive
OWNER_NAME: hadoop
OWNER_TYPE: USER
*************************** 3. row ***************************
DB_ID: 7
DESC: NULL
DB_LOCATION_URI: hdfs://bigdata01:9000/user/hive/warehouse/ruozedata_erp.db
NAME: ruozedata_erp
OWNER_NAME: hadoop
OWNER_TYPE: USER
3 rows in set (0.06 sec)
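The metastore can also answer which tables live in which database. A sketch, assuming the standard metastore schema in which TBLS references DBS through DB_ID:

-- list every table together with its owning database
select d.NAME as db_name, t.TBL_NAME, t.TBL_TYPE
from DBS d join TBLS t on d.DB_ID = t.DB_ID
order by d.NAME;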

Hive's Metatool

[hadoop@bigdata01 hive]$ bin/metatool --help
usage: metatool
-dryRun Perform a dry run of updateLocation
changes.When run with the dryRun
option updateLocation changes are
displayed but not persisted. dryRun
is valid only with the
updateLocation option.
-executeJDOQL <query-string> execute the given JDOQL query
-help print this message
-listFSRoot print the current FS root locations
-serdePropKey <serde-prop-key> Specify the key for serde property
to be updated. serdePropKey option
is valid only with updateLocation
option.
-tablePropKey <table-prop-key> Specify the key for table property
to be updated. tablePropKey option
is valid only with updateLocation
option.
-updateLocation <new-loc> <old-loc> Update FS root location in the
metastore to new location.Both
new-loc and old-loc should be valid
URIs with valid host names and
schemes.When run with the dryRun
option changes are displayed but
are not persisted. When run with
the serdepropKey/tablePropKey
option updateLocation looks for the
serde-prop-key/table-prop-key that
is specified and updates its value
if found.
[hadoop@bigdata01 hive]$ metatool -listFSRoot
20/01/27 11:05:52 INFO metastore.ObjectStore: Initialized ObjectStore
Listing FS Roots..
hdfs://bigdata01:9000/user/hive/warehouse                    <- Hive's default warehouse directory on HDFS
hdfs://bigdata01:9000/user/hive/warehouse/company.db         <- directory of another Hive database
hdfs://bigdata01:9000/user/hive/warehouse/ruozedata_erp.db   <- directory of another Hive database
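The usual reason to reach for metatool is a NameNode address change: the FS roots above are stored verbatim in the metastore and go stale when the cluster moves. Per the help text above, -updateLocation rewrites them and -dryRun previews the change without persisting it. A sketch with a hypothetical new address bigdata02:

# preview only: with -dryRun the changes are displayed but not persisted
metatool -updateLocation hdfs://bigdata02:9000/user/hive/warehouse hdfs://bigdata01:9000/user/hive/warehouse -dryRun

# apply for real once the dry run looks right
metatool -updateLocation hdfs://bigdata02:9000/user/hive/warehouse hdfs://bigdata01:9000/user/hive/warehouse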
Recursive WordCount across multiple folders in Hive, with the files stored on Hadoop HDFS.

Prepare the data files:
[hadoop@bigdata01 hive]$ hadoop fs -ls /hive/mutli_dir
20/01/27 11:32:26 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
drwxr-xr-x - hadoop supergroup 0 2020-01-27 11:31 /hive/mutli_dir/dir_2
-rw-r--r-- 1 hadoop supergroup 42 2020-01-27 11:30 /hive/mutli_dir/word.txt

[hadoop@bigdata01 hive]$ hadoop fs -ls /hive/mutli_dir/dir_2
20/01/27 11:32:55 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 1 items
-rw-r--r-- 1 hadoop supergroup 42 2020-01-27 11:31 /hive/mutli_dir/dir_2/word.txt

[hadoop@bigdata01 hive]$ hadoop fs -cat /hive/mutli_dir/dir_2/word.txt
20/01/27 11:33:33 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
hadoop,flume
hive,hive,flink
spark,hbase,
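One way to run the earlier WordCount over this whole tree from Hive is an external table pointed at the parent directory. A sketch (wc_multi is a hypothetical table name, and reading the nested dir_2 relies on the recursive-input settings covered at the end of this post):

-- external table over the directory tree; the files stay in place on HDFS
create external table wc_multi(
word string
) location '/hive/mutli_dir';

-- same aggregation as before, now covering both word.txt files
select word, count(1) coun
from (select explode(split(word, ',')) word from wc_multi) t
group by word;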

Configuring Multi-Directory Input in Hive

Settings for the wordcount program when its input spans multiple directories.

Using Hadoop's built-in wordcount program:

hadoop jar /home/hadoop/app/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.6.0-cdh5.16.2.jar wordcount /hive/mutli_dir /hive/out1

This fails with:
Caused by: org.apache.hadoop.ipc.RemoteException(java.io.FileNotFoundException): Path is not a file: /hive/mutli_dir/dir_2
at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:70)
at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:56)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocationsInt(FSNamesystem.java:2157)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:2127)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:2040)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:583)
at org.apache.hadoop.hdfs.server.namenode.AuthorizationProviderProxyClientProtocol.getBlockLocations(AuthorizationProviderProxyClientProtocol.java:94)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:377)
at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:617)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1073)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2278)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2274)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1924)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2272)

The fix is to enable recursive input in mapred-site.xml:

<property>
  <name>mapreduce.input.fileinputformat.input.dir.recursive</name>
  <value>true</value>
</property>
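Editing mapred-site.xml applies cluster-wide. The same flag can also be passed per job, since the examples jar parses generic -D options. A sketch:

hadoop jar /home/hadoop/app/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.6.0-cdh5.16.2.jar wordcount -Dmapreduce.input.fileinputformat.input.dir.recursive=true /hive/mutli_dir /hive/out1

Inside the Hive CLI, the session-level equivalents are set mapreduce.input.fileinputformat.input.dir.recursive=true; together with set hive.mapred.supports.subdirectories=true;.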